More Classification: ISL 4

DJM

3 April 2018

Linear classifiers

Logistic regression

# Scatter of the two features, colored by class label.
# (df and the colors blue/red come from an earlier setup chunk.)
g <- ggplot(df, aes(X1, X2, color = y)) +
  geom_point() +
  scale_color_manual(values = c(blue, red))
g

# Additive logistic regression of y on every other column of df.
log.lm <- glm(y ~ ., data = df, family = 'binomial')
summary(log.lm)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.63909  -0.24256  -0.03246  -0.00061   2.35104  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.803      1.010  -3.764 0.000167 ***
## X1            -6.593      1.918  -3.438 0.000586 ***
## X2             7.271      1.945   3.739 0.000185 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 107.855  on 99  degrees of freedom
## Residual deviance:  36.785  on 97  degrees of freedom
## AIC: 42.785
## 
## Number of Fisher Scoring iterations: 8

What is the equation of the decision boundary line?

# The decision boundary solves b0 + b1*X1 + b2*X2 = 0 for X2,
# giving intercept -b0/b2 and slope -b1/b2.
beta <- coefficients(log.lm)
g + geom_abline(intercept = -beta[1] / beta[3],
                slope = -beta[2] / beta[3], color = green)

Classification boundary

# Inverse-logit curve over the linear predictor; the red vertical line
# marks the classification cutoff at 0 (probability 0.5).
# (ilogit is presumably defined in an earlier setup chunk — confirm.)
ggplot(data.frame(x = c(-10, 10)), aes(x)) +
  stat_function(fun = ilogit) +
  geom_vline(xintercept = 0, color = red) +
  ylab('probability')

Lots of different boundaries

## # A tibble: 4 x 3
## # Groups: index [4]
##   index intercept  slope
##   <chr>     <dbl>  <dbl>
## 1 a        0.0907 -0.393
## 2 b        0.0905 -0.373
## 3 c        0.207  -3.67 
## 4 d        7.96    7.11

Linear discriminant analysis

LDA

(Not to be confused with Latent Dirichlet Allocation, also abbrev. LDA)

Bayes Rule: \[ P(Y_i=1 {\ \vert\ }X_i=x) = \frac{P(X_i {\ \vert\ }Y_i=1)\pi_1} {P(X_i {\ \vert\ }Y_i=1)\pi_1 + P(X_i {\ \vert\ }Y_i=0)\pi_0} \]

Simplification

\[ \begin{aligned} \frac{P(X_i {\ \vert\ }Y_i=1)\pi_1} {P(X_i {\ \vert\ }Y_i=1)\pi_1 + P(X_i {\ \vert\ }Y_i=0)\pi_0} &= \frac{\pi_1 \frac{1}{(2\pi|\Sigma|)^{p/2}}\exp\left(-\frac{1}{2}(x-\mu_1)^\top \Sigma^{-1} (x-\mu_1) \right)} {\sum_{j=0,1}\pi_j\frac{1}{(2\pi|\Sigma|)^{p/2}}\exp\left(-\frac{1}{2}(x-\mu_j)^\top \Sigma^{-1} (x-\mu_j) \right)}\\ &= (\textrm{take logs})\\ &= \ldots\\ &=x^\top\Sigma^{-1}\mu_1-\frac{1}{2}\mu_1^\top \Sigma^{-1}\mu_1 + \log\pi_1\\ &=: \delta_1 \end{aligned} \]

Why is this linear?

\[ \begin{aligned} &\delta_1=\delta_0\\ &\Rightarrow x^\top\Sigma^{-1}\mu_1-\frac{1}{2}\mu_1^\top \Sigma^{-1}\mu_1 + \log\pi_1 = x^\top\Sigma^{-1}\mu_0-\frac{1}{2}\mu_0^\top \Sigma^{-1}\mu_0 + \log\pi_0\\ &\Rightarrow x^\top\Sigma^{-1}(\mu_1-\mu_0) -\frac{1}{2}\left(\mu_1^\top \Sigma^{-1}\mu_1 - \mu_0^\top \Sigma^{-1}\mu_0\right) + \log\pi_1-\log\pi_0 = 0 \end{aligned} \]

Example

# Simulate two Gaussian classes sharing one covariance matrix —
# exactly the model LDA assumes.
library(mvtnorm)
n = 100
pi1 = 0.5                # P(Y = 1)
n1 = floor(n * pi1)
n0 = n - n1
mu1 = c(1, 2)
mu0 = c(-1, -1)
Sigma = 2 * diag(2)      # common covariance for both classes
X1 = rmvnorm(n1, mu1, Sigma)
X2 = rmvnorm(n0, mu0, Sigma)
X = rbind(X1, X2)
Y = factor(c(rep(1, n1), rep(0, n0)))
df = data.frame(Y, X)
g <- ggplot(df, aes(X1, X2, color = Y)) +
  geom_point() +
  scale_color_manual(values = c(blue, red))
g

The distributions and the classifier

# Population LDA boundary (equal priors, so no log-odds term):
# x' Sinv (mu1 - mu0) - (1/2)(mu1' Sinv mu1 - mu0' Sinv mu0) = 0.
Sinv = solve(Sigma)
slope.vec = t(mu1 - mu0) %*% Sinv
intercept = 0.5 * (t(mu0) %*% Sinv %*% mu0 - t(mu1) %*% Sinv %*% mu1)
g + stat_ellipse(type = 'norm') +  # ellipses estimated from the sample, not the truth
  geom_abline(intercept = -intercept / slope.vec[2],
              slope = -slope.vec[1] / slope.vec[2], color = green)

Try another one

# Same exercise with correlated features and closer class means.
mu1 = c(1, 2)
mu0 = c(1, -1)
Sigma = 2 * matrix(c(1, -.5, -.5, 1), 2)
X1 = rmvnorm(n1, mu1, Sigma)
X2 = rmvnorm(n0, mu0, Sigma)
X = rbind(X1, X2)
Y = factor(c(rep(1, n1), rep(0, n0)))
df = data.frame(Y, X)
# Population boundary again, from the true parameters.
Sinv = solve(Sigma)
slope.vec = t(mu1 - mu0) %*% Sinv
intercept = 0.5 * (t(mu0) %*% Sinv %*% mu0 - t(mu1) %*% Sinv %*% mu1)
ggplot(df, aes(X1, X2, color = Y)) +
  geom_point() +
  scale_color_manual(values = c(blue, red)) +
  stat_ellipse(type = 'norm') +
  geom_abline(intercept = -intercept / slope.vec[2],
              slope = -slope.vec[1] / slope.vec[2], color = green)

Same one, but make n big

# Same distributions, but with a much larger sample: the estimated
# ellipses should hug the truth more closely.
n1 = 500
n0 = 500
X1 = rmvnorm(n1, mu1, Sigma)
X2 = rmvnorm(n0, mu0, Sigma)
X = rbind(X1, X2)
Y = factor(c(rep(1, n1), rep(0, n0)))
df = data.frame(Y, X)
Sinv = solve(Sigma)
slope.vec = t(mu1 - mu0) %*% Sinv
intercept = 0.5 * (t(mu0) %*% Sinv %*% mu0 - t(mu1) %*% Sinv %*% mu1)
ggplot(df, aes(X1, X2, color = Y)) +
  geom_point() +
  scale_color_manual(values = c(blue, red)) +
  stat_ellipse(type = 'norm') +
  geom_abline(intercept = -intercept / slope.vec[2],
              slope = -slope.vec[1] / slope.vec[2], color = green)

Same one, but change P(Y=1)

# Unbalanced classes: pi1 = 0.75, so the boundary shifts by the
# log prior odds, log(pi1) - log(pi0).
n1 = 75
n0 = 25
X1 = rmvnorm(n1, mu1, Sigma)
X2 = rmvnorm(n0, mu0, Sigma)
X = rbind(X1, X2)
y = factor(c(rep(1, n1), rep(0, n0)))
df = data.frame(y, X)
Sinv = solve(Sigma)
slope.vec = t(mu1 - mu0) %*% Sinv
intercept = 0.5 * (t(mu0) %*% Sinv %*% mu0 - t(mu1) %*% Sinv %*% mu1) +
  log(.75) - log(.25)
ggplot(df, aes(X1, X2, color = y)) +
  geom_point() +
  scale_color_manual(values = c(blue, red)) +
  stat_ellipse(type = 'norm') +
  geom_abline(intercept = -intercept / slope.vec[2],
              slope = -slope.vec[1] / slope.vec[2], color = green)

Ok, how do you do it?

library(MASS)
# Fit LDA on the two predictors (estimated means/covariance/priors).
lda.fit = lda(y~X1+X2, data=df) 
# lda.disc and decision.boundary are course helper functions defined in an
# earlier chunk (not visible here); presumably each returns a one-row data
# frame with columns `intercept` and `slope` — TODO confirm.
sl.int = lda.disc(lda.fit,df)
log.bd = decision.boundary(df)
# The true (population) boundary, from the parameters computed above.
truth = data.frame(intercept=-intercept/slope.vec[2], slope=-slope.vec[1]/slope.vec[2])
# NOTE: the row order of this rbind must match the labels on the next line.
dfa = rbind(sl.int,log.bd,truth)
dfa$discriminant = c('lda','logistic','truth')
# Overlay all three boundaries on the scatter, colored by method.
ggplot(df, aes(X1,X2,color=y)) + geom_point() + scale_color_brewer(palette = 'Set1')+
  stat_ellipse(type='norm') + 
  geom_abline(mapping=aes(intercept=intercept, slope=slope,color=discriminant),data=dfa)

Comparing LDA and Logistic regression

QDA

# Two classes with *different* covariance matrices: LDA is now
# misspecified, while QDA matches the generating model.
n1 = 50
n0 = 50
Sigma1 = matrix(c(2, .8, .8, 1), 2)
Sigma0 = matrix(c(1, -.5, -.5, 2), 2)
X1 = rmvnorm(n1, mu1, Sigma1)
X2 = rmvnorm(n0, mu0, Sigma0)
X = rbind(X1, X2)
y = factor(c(rep(1, n1), rep(0, n0)))
df = data.frame(y, X)
qda.fit = qda(y ~ X1 + X2, data = df)
lda.fit = lda(y ~ X1 + X2, data = df)
# Dense grid over the observed range, for drawing decision regions.
pred.grid = expand.grid(
  X1 = seq(min(df$X1), max(df$X1), len = 100),
  X2 = seq(min(df$X2), max(df$X2), len = 100))
pred.grid$qda = predict(qda.fit, newdata = pred.grid)$class
pred.grid$lda = predict(lda.fit, newdata = pred.grid)$class
# Long format: one (X1, X2, classifier, prediction) row per grid cell.
pg = gather(pred.grid, key = 'key', value = 'predictions', -c(X1, X2))
ggplot(pg, aes(X1, X2)) + geom_raster(aes(fill = predictions)) +
  facet_wrap(~key) + scale_fill_brewer() +
  geom_point(data = df, mapping = aes(X1, X2, color = y)) +
  scale_color_brewer(palette = 'Set1')

KNN

Re-entry

# Decision regions for 3-nearest-neighbors on the same kind of grid.
library(class)
pred.grid = expand.grid(
  X1 = seq(min(df$X1), max(df$X1), len = 100),
  X2 = seq(min(df$X2), max(df$X2), len = 100))
# knn(train, test, cl, k): df[,-1] drops the label column y.
pred.grid$knn3 = knn(df[, -1], pred.grid, df$y, k = 3)
ggplot(pred.grid, aes(X1, X2)) + geom_raster(aes(fill = knn3)) +
  scale_fill_brewer() +
  geom_point(data = df, mapping = aes(X1, X2, color = y)) +
  scale_color_brewer(palette = 'Set1')

Choosing k

# Fit several values of k on the same grid (knn3 carries over from the
# previous chunk, so the faceted plot shows five panels).
# pred.grid[, 1:2] keeps just the X1/X2 coordinates as the test set.
pred.grid$knn1  = knn(df[, -1], pred.grid[, 1:2], df$y, k = 1)
pred.grid$knn5  = knn(df[, -1], pred.grid[, 1:2], df$y, k = 5)
pred.grid$knn10 = knn(df[, -1], pred.grid[, 1:2], df$y, k = 10)
pred.grid$knn20 = knn(df[, -1], pred.grid[, 1:2], df$y, k = 20)
pg = gather(pred.grid, key = 'k', value = 'knn', -c(X1, X2))

ggplot(pg, aes(X1, X2)) + geom_raster(aes(fill = knn)) +
  facet_wrap(~k, labeller = label_both) + scale_fill_brewer() +
  geom_point(data = df, mapping = aes(X1, X2, color = y)) +
  scale_color_brewer(palette = 'Set1')

knn.cv

# Choose k by leave-one-out cross-validation: knn.cv classifies each
# training point using all the others, for each candidate k.
kmax = 20
err = double(kmax)  # preallocated vector of LOO-CV error rates
for (ii in seq_len(kmax)) {  # seq_len is the safe idiom (vs 1:kmax)
  pk = knn.cv(df[, -1], df$y, k = ii)  # leave-one-out CV predictions
  err[ii] = mean(pk != df$y)           # misclassification rate at k = ii
}
ggplot(data.frame(k = seq_len(kmax), error = err), aes(k, error)) +
  geom_point(color = red) +
  geom_line(color = red)

# Decision regions at the CV-chosen k = 15.
pred.grid$knn15 = knn(df[, -1], pred.grid[, 1:2], df$y, k = 15)
ggplot(pred.grid, aes(X1, X2)) + geom_raster(aes(fill = knn15)) +
  scale_fill_brewer() +
  geom_point(data = df, mapping = aes(X1, X2, color = y)) +
  scale_color_brewer(palette = 'Set1')

# Confusion matrix for k = 15, predicting the training data itself
# (outer parentheses print the assigned table).
(tt <- table(knn(df[, -1], df[, -1], df$y, k = 15), df$y,
             dnn = c('predicted', 'truth')))
##          truth
## predicted  0  1
##         0 42  5
##         1  8 45
# Training misclassification rate = 1 - accuracy.
1 - sum(diag(tt)) / sum(tt)
## [1] 0.13

Kernelization

Other non-linear classifiers

The idea

The trick

If:

  1. There is a mapping \(\phi: \mathcal{X}\rightarrow\mathcal{Z}\)

  2. Your classifier of choice only needs the inner products between observations, not the observations themselves. (Logistic regression, LDA, and SVMs all work.)

  3. There is a function \(K\) such that \(K(x,x')=\langle\phi(x),\phi(x')\rangle\) (Mercer’s theorem gives this)

Then, we can just replace all inner products \(\langle x,x'\rangle\) with \(K(x,x')\).